In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics import homogeneity_score

from scipy.cluster.hierarchy import linkage, dendrogram

np.set_printoptions(suppress=True, precision=5)


%matplotlib inline

In [2]:
X, y = make_blobs(n_samples = 150, n_features=2, 
                  centers=3, cluster_std=0.5, shuffle=True, random_state=0)

In [3]:
plt.scatter(X[:, 0], X[:, 1], c = "steelblue", marker = "o", s = 50)
plt.xlabel("X1")
plt.ylabel("X2")


Out[3]:
Text(0, 0.5, 'X2')

In [4]:
km = KMeans(n_clusters=3, init="random", n_init = 10, 
            max_iter = 300, tol = 1e-04, random_state=0)
y_km = km.fit_predict(X)

In [5]:
def show_cluster(X, y, estimator = None, ignore_noise = True):
    """Scatter-plot columns 0 and 1 of X with one color per distinct label in y.

    Parameters
    ----------
    X : 2-D array
        Observations; column 0 is drawn on the x axis, column 1 on the y axis.
    y : 1-D array
        Cluster label of each row of X.
    estimator : object, optional
        If it exposes ``cluster_centers_``, the first two coordinates of each
        center are drawn as black stars. ``None`` or an estimator without that
        attribute draws no centers.
    ignore_noise : bool
        When True, rows labelled -1 (treated as noise) are not plotted.

    Draws on the current matplotlib axes and returns None.
    """
    levels = set(y)
    if ignore_noise:
        levels.discard(-1)

    # Colors are assigned by position, not by label value, so labels need not
    # be the contiguous integers 0..n-1 (e.g. {0, 2} or string labels).
    # The noise label, when kept, is ordered last so that inputs which already
    # worked keep exactly the colors they had before.
    ordered = sorted(levels, key=lambda k: (k == -1, k))
    colors = sns.color_palette("husl", len(ordered))

    for color, k in zip(colors, ordered):
        data = X[y == k, :]
        plt.scatter(data[:, 0], data[:, 1], color = color, s = 50, label = "Cluster %s" % k)

    centroids = getattr(estimator, "cluster_centers_", None)
    if centroids is not None:
        plt.scatter(centroids[:, 0], centroids[:, 1], color = "black", marker = "*", s = 150)

    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.legend(loc = "lower left")
    
show_cluster(X, y_km, km)



In [6]:
km.cluster_centers_


Out[6]:
array([[ 0.93297,  4.35421],
       [ 2.06522,  0.96137],
       [-1.59473,  2.92237]])

In [7]:
# inertia_ is the sum of *squared* distances of samples to their closest cluster center.
print("Distortion (Within Cluster SSE): %.2f" % km.inertia_)


Distortion (Within Cluster SSE): 72.48

In [8]:
# Homogeneity of the K-Means assignment (y_km) against the true blob labels (y).
# A score of 1.0 means every predicted cluster contains samples of a single true class.
homogeneity_score(y, y_km)


Out[8]:
1.0

In [9]:
X, y = make_blobs(n_samples = 150, n_features=2, centers=3, 
                  cluster_std=1.0, shuffle=True, random_state=0)
km = KMeans(n_clusters=3, init="random", n_init = 10, 
            max_iter = 300, tol = 1e-04, random_state=0)
y_km = km.fit_predict(X)
print("Homogeneity score: ", homogeneity_score(y, y_km), "Inertia: ", km.inertia_)
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
show_cluster(X, y, km)
plt.title("True Clusters")
plt.subplot(1, 2, 2)
show_cluster(X, y_km, km)
plt.title("Estimated clusters")


Homogeneity score:  0.6812190697089529 Inertia:  262.72046565264066
Out[9]:
Text(0.5, 1.0, 'Estimated clusters')

Find optimal number of clusters using elbow method


In [10]:
def find_elbow(X, n = 10):
    """Plot K-Means distortion (inertia) against K for K = 1 .. n-1.

    One KMeans model (k-means++ init, 10 restarts, fixed seed) is fitted on X
    per candidate K; the resulting curve is drawn on the current axes so the
    "elbow" can be read off visually. Returns None.
    """
    candidate_ks = list(range(1, n))
    distortions = [
        KMeans(n_clusters=k, max_iter=300, n_init=10, random_state=0, init="k-means++")
        .fit(X)
        .inertia_
        for k in candidate_ks
    ]

    plt.plot(candidate_ks, distortions)
    plt.xlabel("Number of clusters (K)")
    plt.ylabel("Distortion")

find_elbow(X)


Find number of clusters from Dendrogram


In [11]:
plt.figure(figsize = (15, 10))
row_clusters = linkage(X, method="complete", metric="euclidean")
f = dendrogram(row_clusters)


Half Moon Dataset and DBSCAN


In [12]:
from sklearn.datasets import make_moons

In [13]:
X, y = make_moons(n_samples=200, noise=0.09, random_state=0)

In [14]:
plt.scatter(X[:, 0], X[:, 1], c = "steelblue", marker = "o", s = 50)
plt.xlabel("X1")
plt.ylabel("X2")


Out[14]:
Text(0, 0.5, 'X2')

In [15]:
# Fit K-Means with K=2 on the half-moon data and compare it with the true labels.
km = KMeans(n_clusters=2, init="random", n_init = 10, max_iter = 300, tol = 1e-04, random_state=0)
y_km = km.fit_predict(X)

plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
# Left panel: ground-truth labels; no estimator is passed, so no centroids are drawn.
show_cluster(X, y)
plt.title("True Clusters")
plt.subplot(1, 2, 2)
# Right panel: K-Means assignments together with the fitted centroids.
show_cluster(X, y_km, km)
plt.title("Estimated clusters")


Out[15]:
Text(0.5, 1.0, 'Estimated clusters')

In [16]:
homogeneity_score(y, y_km)


Out[16]:
0.1810982320182603

In [17]:
dbscan = DBSCAN(eps=0.2, min_samples=10, metric="euclidean")
y_db = dbscan.fit_predict(X)

plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
show_cluster(X, y, dbscan)
plt.title("True Clusters")
plt.subplot(1, 2, 2)
show_cluster(X, y_db, dbscan)
plt.title("Estimated clusters")


Out[17]:
Text(0.5, 1.0, 'Estimated clusters')

In [18]:
# Distinct DBSCAN labels, excluding -1, which marks noise rather than a cluster.
labels = set(y_db)
labels.discard(-1)
print("No of clusters: ", len(labels))


No of clusters:  7

In [19]:
homogeneity_score(y, y_db)


Out[19]:
0.9395815602003369

Applying clustering to grouplens movies dataset based on genre


In [20]:
movies = pd.read_csv("/data/movielens/movies.csv", index_col="movieId")
movies.head()


Out[20]:
title genres
movieId
1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
2 Jumanji (1995) Adventure|Children|Fantasy
3 Grumpier Old Men (1995) Comedy|Romance
4 Waiting to Exhale (1995) Comedy|Drama|Romance
5 Father of the Bride Part II (1995) Comedy

In [21]:
movies.sample(10)


Out[21]:
title genres
movieId
2202 Lifeboat (1944) Drama|War
991 Michael Collins (1996) Drama
4863 Female Trouble (1975) Comedy|Crime
647 Courage Under Fire (1996) Action|Crime|Drama|War
85788 Insidious (2010) Fantasy|Horror|Thriller
963 Inspector General, The (1949) Musical
4570 Big Picture, The (1989) Comedy|Drama
55999 Mr. Magorium's Wonder Emporium (2007) Children|Comedy|Fantasy
640 Diabolique (1996) Drama|Thriller
1960 Last Emperor, The (1987) Drama

In [22]:
# Drop movies that carry the "(no genres listed)" placeholder instead of genres.
# The placeholder is matched as a literal substring (regex=False), so its
# parentheses need no escaping; the previous "\(" in a non-raw string was an
# invalid escape sequence. .copy() makes the filtered frame independent of the
# original, so adding columns to it later does not raise SettingWithCopyWarning.
movies = movies[~movies["genres"].str.contains("(no genres listed)", regex=False)].copy()
movies.sample(10)


Out[22]:
title genres
movieId
3979 Little Nicky (2000) Comedy
5223 Pauline & Paulette (Pauline en Paulette) (2001) Comedy|Drama
51094 Gray Matters (2006) Comedy|Drama|Romance
2828 Dudley Do-Right (1999) Children|Comedy
80083 Dragon Ball Z: Dead Zone (Doragon bôru Z 1: Or... Action|Adventure|Animation|Fantasy|Sci-Fi
113640 Canal, The (2014) Horror|Thriller
6299 Winged Migration (Peuple migrateur, Le) (2001) Documentary
1192 Paris Is Burning (1990) Documentary
68901 Chop Shop (2007) Drama
3032 Omega Man, The (1971) Action|Drama|Sci-Fi|Thriller

In [23]:
# Build the sorted vocabulary of distinct genre names by splitting every
# pipe-separated "genres" entry.
genres = sorted(
    {name for entry in movies["genres"] for name in entry.split(r"|")}
)
print(genres, len(genres))


['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'] 19

In [24]:
def to_vector(g):
    """Multi-hot encode a pipe-separated genre string.

    Returns a float array with one slot per entry of the module-level
    `genres` list; slots whose genre appears in `g` are 1, all others 0.
    Raises ValueError if `g` names a genre that is not in `genres`.
    """
    # Resolve every name first so an unknown genre fails before allocating.
    positions = [genres.index(name) for name in g.split(r"|")]
    encoded = np.zeros(len(genres))
    for pos in positions:
        encoded[pos] = 1
    return encoded

genres_idx = movies["genres"].apply(to_vector)
genres_idx.head(10)


Out[24]:
movieId
1     [0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, ...
2     [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
3     [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
4     [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...
5     [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
6     [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
7     [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
8     [0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
9     [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
10    [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: genres, dtype: object

In [25]:
X = np.array(genres_idx.tolist())
print("X.shape: ", X.shape)


X.shape:  (9107, 19)

Normalize the data


In [26]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [27]:
plt.figure(figsize = (15, 10))
row_clusters = linkage(X_std, method="complete", metric="euclidean")
f = dendrogram(row_clusters, p = 5, truncate_mode="level")


To visualize the clusters, let's apply PCA with 2 components.


In [28]:
from sklearn.decomposition import KernelPCA, PCA

In [29]:
# Fit a full PCA (all components) to see how much variance each one explains.
pca = PCA(random_state=0)
X_pca = pca.fit_transform(X_std)

ratios = pca.explained_variance_ratio_
# Number components from 1 so the x axis really reads as "number of components".
component_counts = range(1, len(ratios) + 1)
plt.bar(component_counts, ratios, label = "Explained variance ratio")
plt.step(component_counts, np.cumsum(ratios),
         label = "Cumsum of Explained variance ratio")
# Without legend() the labels set above are never rendered.
plt.legend(loc = "best")
plt.title("Explained variance")
plt.ylabel("Explained Variance Ratio")
plt.xlabel("Number of PCA components")


Out[29]:
Text(0.5, 0, 'Number of PCA components')

The first 2 principal components have low explained variance coverage.


In [30]:
pca = PCA(random_state=0, n_components=2)
X_pca = pca.fit_transform(X_std)

plt.figure(figsize = (15, 8))
plt.scatter(X_pca[:, 0], X_pca[:, 1])
plt.xlabel("PCA1")
plt.ylabel("PCA2")


Out[30]:
Text(0, 0.5, 'PCA2')

There is no visual indication of clusters from the 2 PCA components, which is consistent with the finding that the explained variance with 2 components is only 2%.


In [31]:
find_elbow(X_std, 40)



In [32]:
# NOTE: despite its name, `knn` is a K-Means model (8 clusters), not k-nearest neighbours.
# n_init is pinned explicitly, like every other KMeans call in this notebook:
# its default differs between scikit-learn versions, and leaving it implicit
# makes the clustering depend on the installed version.
knn = KMeans(n_clusters=8, n_init=10, max_iter=300, random_state=0)
y_pred = knn.fit_predict(X_std)

For each observation, compute the distance to its nearest (assigned) cluster centroid.


In [33]:
def distance(p1, p2):
    """Return the Euclidean distance between two arrays.

    Both inputs are flattened to 1-D first, so they only need to hold the
    same number of elements, not the same shape.
    """
    delta = p1.flatten() - p2.flatten()
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# Distance from every observation to the centroid of the cluster K-Means
# assigned it to, computed for all rows at once instead of a Python loop.
# Indexing cluster_centers_ with labels_ yields one centroid row per observation.
assigned_centers = knn.cluster_centers_[knn.labels_]
distances = np.sqrt(((X_std - assigned_centers) ** 2).sum(axis=1))

# assign() returns a new frame, so re-running this cell is safe and no
# SettingWithCopyWarning is raised on the previously filtered `movies` frame.
# The array is attached positionally; X_std rows are in the same order as movies.
movies = movies.assign(distance=distances)
movies.sort_values("distance", ascending=False)[:10]


Out[33]:
title genres distance
movieId
81132 Rubber (2010) Action|Adventure|Comedy|Crime|Drama|Film-Noir|... 10.008449
81847 Tangled (2010) Animation|Children|Comedy|Fantasy|Musical|Roma... 8.401892
83613 Cowboys & Aliens (2011) Action|Sci-Fi|Thriller|Western|IMAX 8.313656
595 Beauty and the Beast (1991) Animation|Children|Fantasy|Musical|Romance|IMAX 8.223587
2142 American Tail: Fievel Goes West, An (1991) Adventure|Animation|Children|Musical|Western 8.047143
103384 Lone Ranger, The (2013) Action|Adventure|Western|IMAX 7.930896
7374 Home on the Range (2004) Animation|Children|Comedy|Musical|Western 7.822317
364 Lion King, The (1994) Adventure|Animation|Children|Drama|Musical|IMAX 7.558606
3159 Fantasia 2000 (1999) Animation|Children|Musical|IMAX 7.416244
26701 Patlabor: The Movie (Kidô keisatsu patorebâ: T... Action|Animation|Crime|Drama|Film-Noir|Mystery... 7.396372

In [34]:
movies[y_pred == 3].sample(10)


Out[34]:
title genres distance
movieId
2367 King Kong (1976) Adventure|Fantasy|Romance|Sci-Fi|Thriller 6.439895
156607 The Huntsman Winter's War (2016) Action|Adventure|Drama|Fantasy 5.160459
51698 Last Mimzy, The (2007) Adventure|Children|Sci-Fi 5.026697
104419 Justice League: Crisis on Two Earths (2010) Action|Animation|Sci-Fi 5.577885
8 Tom and Huck (1995) Adventure|Children 4.047729
30810 Life Aquatic with Steve Zissou, The (2004) Adventure|Comedy|Fantasy 4.594276
99764 It's Such a Beautiful Day (2012) Animation|Comedy|Drama|Fantasy|Sci-Fi 5.681293
3287 Tigger Movie, The (2000) Animation|Children 3.966153
95311 Presto (2008) Animation|Children|Comedy|Fantasy 4.163322
919 Wizard of Oz, The (1939) Adventure|Children|Fantasy|Musical 5.874266

In [ ]: